www.gusucode.com > wxApp PHP版微信小程序CMS系统 v1.0PHP源码程序 > wxApp PHP版微信小程序CMS系统 v1.0/wxAppCMS_v1.0.0/wxAppCMS_v1.0.0/app/spider/spider_urls.class.php
<?php
/**
 * iCMS - i Content Management System
 * Copyright (c) 2007-2017 iCMSdev.com. All rights reserved.
 *
 * @author icmsdev <master@icmsdev.com>
 * @site https://www.icmsdev.com
 * @licence https://www.icmsdev.com/LICENSE.html
 */
defined('iPHP') OR exit('What are you doing?');

/**
 * List-page crawler of the spider module.
 *
 * Expands the configured list URLs, downloads every list page, extracts the
 * item links from it and then, depending on the work mode, publishes them,
 * collects them for later publishing, or simply returns them.
 */
class spider_urls {
    /** Optional list-URL override; when set it replaces the rule's / project's list URLs. */
    public static $urls = null;

    /**
     * Crawl the list pages of a project / rule.
     *
     * List URLs are taken, in increasing priority, from the rule's
     * `list_urls`, the project's `urls`, `spider_urls::$urls` and `$_urls`.
     * Each line may be:
     *   - a plain URL,
     *   - `RULE@rid@url`: `url` is crawled with rule `rid` and the item URLs
     *     found there are used as list URLs,
     *   - a URL containing `<DATE:format>`: replaced by `date(format)`,
     *   - a URL containing `<format,begin,num,step,zeroize,reverse>`:
     *     expanded through `spider_tools::mkurls()`.
     *
     * @param string|null $work     Work mode. The modes handled here are
     *                              'shell', 'WEB@AUTO', 'DATA@RULE' and 'WEB@MANUAL'.
     * @param mixed       $pid      Project id; NULL falls back to `spider::$pid`,
     *                              a falsy value means "no project".
     * @param mixed       $_rid     Rule id override (NULL keeps the project's / spider's rule id).
     * @param string|null $_urls    List URLs override, one entry per line.
     * @param string|null $callback Pass 'CALLBACK@URL' to get back the plain item URLs.
     *
     * @return mixed 'WEB@AUTO': list of spider_url rows to publish;
     *               'DATA@RULE': crawled contents keyed by list-item key;
     *               'WEB@MANUAL': array with cid/rid/pid/sid/work/rule/listsArray;
     *               'CALLBACK@URL': list of item URLs;
     *               'shell': nothing (FALSE when the URL list is empty).
     */
    public static function crawl($work = NULL,$pid = NULL,$_rid = NULL,$_urls=null,$callback=null) {
        @set_time_limit(0);

        // Defaults for everything that is only assigned on some code paths.
        $project        = array();
        $prule_list_url = null;
        $lastupdate     = null;
        $sid            = null; // never assigned in this method; only reported back to callers/logs
        $url            = null;
        $contentArray   = array();
        $listsArray     = array();

        $pid === NULL && $pid = spider::$pid;
        if ($pid) {
            $project        = spider::project($pid);
            $cid            = $project['cid'];
            $rid            = $project['rid'];
            $prule_list_url = $project['list_url'];
            $lastupdate     = $project['lastupdate'];
        } else {
            $cid = spider::$cid;
            $rid = spider::$rid;
        }
        if ($_rid !== NULL) {
            $rid = $_rid;
        }

        if ($work == 'shell') {
            $lastupdate = isset($project['lastupdate']) ? $project['lastupdate'] : null;
            // Respect the project's minimum interval between two shell runs.
            if (!empty($project['psleep']) && time() - $lastupdate < $project['psleep']) {
                echo '采集方案['.$pid."]:".format_date($lastupdate)."刚采集过了,请".($project['psleep']/3600)."小时后在继续采集\n";
                return;
            }
            echo "\033[32m开始采集方案[".$pid."] 采集规则[".$rid."]\033[0m\n";
        }

        $ruleA = spider::rule($rid);
        $rule  = $ruleA['rule'];

        // Later sources override earlier ones.
        $urls = $rule['list_urls'];
        if (!empty($project['urls'])) {
            $urls = $project['urls'];
        }
        if (self::$urls) {
            $urls = self::$urls;
        }
        if ($_urls) {
            $urls = $_urls;
        }

        $urlsArray = array_filter(explode("\n", $urls));
        $urlsList  = array();
        if ($work == 'shell') {
            print_r($urlsArray);
        }

        // Expand the special list-URL syntaxes; plain URLs stay in $urlsArray untouched.
        // NOTE(review): htmlspecialchars_decode() below only affects the expanded
        // forms; plain URLs are fetched exactly as configured - confirm this is intended.
        foreach ($urlsArray as $entryKey => $entry) {
            $entry = trim($entry);
            if (empty($entry)) {
                // Whitespace-only lines (e.g. a lone "\r") would otherwise be fetched as an empty URL.
                unset($urlsArray[$entryKey]);
                continue;
            }
            $entry = htmlspecialchars_decode($entry);

            if (strpos($entry, 'RULE@') !== false) {
                // RULE@rid@url - limit the split so an '@' inside the URL is kept.
                $ruleParts = array_pad(explode('@', $entry, 3), 3, null);
                $subRid    = $ruleParts[1];
                $subUrls   = $ruleParts[2];
                if (spider::$ruleTest) {
                    print_r('<b>使用[rid:'.$subRid.']规则抓取列表</b>:'.$subUrls);
                    echo "<hr />";
                }
                $subList = self::crawl($work, false, $subRid, $subUrls, 'CALLBACK@URL');
                // The nested call may return FALSE/NULL (e.g. empty list in shell mode).
                if (is_array($subList)) {
                    $urlsList = array_merge($urlsList, $subList);
                }
                unset($urlsArray[$entryKey]);
                continue;
            }

            preg_match('|.*<(.*)>.*|is', $entry, $tagMatch);
            if (!$tagMatch) {
                continue;
            }
            if (strpos($tagMatch[1], 'DATE:') !== false) {
                // <DATE:format> - limit the split so formats such as "H:i" survive.
                $dateParts = explode(':', $tagMatch[1], 2);
                $format    = isset($dateParts[1]) ? $dateParts[1] : '';
                $urlsArray[$entryKey] = str_replace('<'.$tagMatch[1].'>', date($format), trim($tagMatch[0]));
            } else {
                // <format,begin,num,step,zeroize,reverse>; missing trailing fields become NULL.
                list($format, $begin, $num, $step, $zeroize, $reverse) = array_pad(explode(',', $tagMatch[1]), 6, null);
                $url       = str_replace($tagMatch[1], '*', trim($tagMatch[0]));
                $generated = spider_tools::mkurls($url, $format, $begin, $num, $step, $zeroize, $reverse);
                unset($urlsArray[$entryKey]);
                if (is_array($generated)) {
                    $urlsList = array_merge($urlsList, $generated);
                }
            }
        }
        if ($urlsList) {
            $urlsArray = array_merge($urlsArray, $urlsList);
        }
        unset($entryKey, $entry, $tagMatch, $urlsList);

        $urlsArray = array_unique(array_filter($urlsArray));

        if (empty($urlsArray)) {
            if ($work == 'shell') {
                echo spider::errorlog("采集列表为空!请填写!\n",$url,'urls.empty',array('pid'=>$pid,'sid'=>$sid,'rid'=>$rid));
                return false;
            }
            iUI::alert('采集列表为空!请填写!', 'js:parent.window.iCMS_MODAL.destroy();');
        }

        if ($rule['mode'] == "2") {
            iPHP::vendor('phpQuery');
            if (spider::$ruleTest && !empty($_GET['pq_debug'])) {
                phpQuery::$debug = 1;
            }
        }

        $pubArray    = array();
        $pubCount    = array();
        $pubAllCount = array();
        spider::$curl_proxy = $rule['proxy'];
        spider::$urlslast   = null;

        if (spider::$ruleTest) {
            echo '<b>列表总:</b>'.count($urlsArray) . "条<br />";
            echo '<pre>';
            print_r($urlsArray);
            echo '</pre>';
            // Rule testing only looks at the first list page.
            $urlsArray = array(reset($urlsArray));
            echo '<b>测试第一条</b><br />';
        }

        foreach ($urlsArray as $url) {
            $url = trim($url);
            spider::$urlslast = $url;
            if ($work == 'shell') {
                echo '开始采集列表:'.$url."\n";
            }
            if (spider::$ruleTest) {
                echo '<b>抓取列表:</b>'.$url . "<br />";
            }

            $html = spider_tools::remote($url);
            if (empty($html)) {
                $msg = "采集列表内容为空!\n";
                $msg.= var_export(spider_tools::$curl_info,true);
                echo spider::errorlog($msg,$url,'url.empty',array('pid'=>$pid,'sid'=>$sid,'rid'=>$rid));
                continue;
            }
            if ($rule['list_urls_format']) {
                $html = spider_tools::dataClean($rule['list_urls_format'], $html);
            }

            if ($rule['mode'] == "2") {
                // Mode 2: DOM selection through phpQuery.
                $doc       = phpQuery::newDocumentHTML($html,'UTF-8');
                $list_area = $doc[trim($rule['list_area_rule'])];
                if ($rule['list_area_format']) {
                    $areaFormat = trim($rule['list_area_format']);
                    if (strpos($areaFormat, 'ARRAY::') !== false) {
                        // ARRAY:: applies the selector to every area node separately.
                        $areaFormat = str_replace('ARRAY::', '', $areaFormat);
                        $lists = array();
                        foreach ($list_area as $areaNode) {
                            $lists[] = phpQuery::pq($areaFormat, $areaNode);
                        }
                    } else {
                        $lists = phpQuery::pq($areaFormat, $list_area);
                    }
                } else {
                    $lists = $list_area;
                }
            } elseif ($rule['mode'] == "3") {
                // Mode 3: JSON response, optionally narrowed by a "a->b->c" key path.
                $list_area = json_decode($html, true);
                if (spider::$ruleTest && is_null($list_area)) {
                    echo '<b>JSON ERROR:'.json_last_error_msg().'</b>';
                    echo "<hr />";
                }
                if ($rule['list_area_rule']) {
                    $keyPath = explode('->', $rule['list_area_rule']);
                    $level   = 0;
                    $lists   = spider_tools::array_filter_key($list_area, $keyPath, $level);
                } else {
                    $lists = $list_area;
                }
            } else {
                // Default mode: regular expressions built from the rule templates.
                $areaPattern = spider_tools::pregTag($rule['list_area_rule']);
                if ($areaPattern && $rule['list_area_rule'] != '<%content%>') {
                    preg_match('|' . $areaPattern . '|is', $html, $matches);
                    $list_area = isset($matches['content']) ? $matches['content'] : null;
                } else {
                    $list_area = $html;
                }
                $html = null;
                unset($html);
                if (spider::$ruleTest) {
                    echo iSecurity::escapeStr($rule['list_area_rule']);
                    echo "<hr />";
                }
                if ($rule['list_area_format']) {
                    $list_area = spider_tools::dataClean($rule['list_area_format'], $list_area);
                }
                preg_match_all('|' . spider_tools::pregTag($rule['list_url_rule']) . '|is', $list_area, $lists, PREG_SET_ORDER);
                // sort "1" is intentionally a no-op (keeps page order).
                if ($rule['sort'] == "2") {
                    asort($lists);
                } elseif ($rule['sort'] == "3") {
                    shuffle($lists);
                }
            }

            if (spider::$ruleTest) {
                echo '<b>列表区域规则:</b>'.iSecurity::escapeStr($rule['list_area_rule']);
                echo "<hr />";
                echo '<b>列表区域抓取结果:</b><div style="max-height:300px;overflow-y: scroll;">';
                if (is_array($list_area)) {
                    echo "<pre>";
                    var_export($list_area);
                    echo "</pre>";
                } else {
                    echo iSecurity::escapeStr($list_area);
                }
                echo '</div>';
                echo "<hr />";
                echo '<b>列表链接规则:</b>'.iSecurity::escapeStr($rule['list_url_rule']);
                echo "<hr />";
                echo '<b>网址合成规则:</b>'.iSecurity::escapeStr($rule['list_url']);
                echo "<hr />";
            }
            $list_area = null;
            unset($list_area);

            // A project-level URL template overrides the rule's one.
            if ($prule_list_url) {
                $rule['list_url'] = $prule_list_url;
            }
            $urlsData = self::lists_item_data($lists, $rule, $url);

            if (!empty(spider::$callback['urls']) && is_callable(spider::$callback['urls'])) {
                $urlsData = call_user_func_array(spider::$callback['urls'], array($urlsData, $url));
                // The callback may switch the work mode for the rest of the run.
                if (!empty($urlsData['work'])) {
                    $work = $urlsData['work'];
                }
            }

            // CALLBACK@URL: hand the item URLs back to the caller.
            // NOTE(review): this returns after the first non-empty list page - confirm that is intended.
            if ($callback === 'CALLBACK@URL') {
                $cbListUrl = array();
                foreach ($urlsData as $value) {
                    if ($value['url'] === false) {
                        continue;
                    }
                    $cbListUrl[] = $value['url'];
                }
                return $cbListUrl;
            }

            if ($work == "WEB@MANUAL") {
                $listsArray[$url] = $urlsData;
            }

            if ($work == "shell") {
                // count() on a non-countable value (e.g. a failed JSON decode) throws on PHP 8.
                $pubCount[$url]['count'] = (is_array($lists) || $lists instanceof Countable) ? count($lists) : 0;
                $pubAllCount['count']    = (isset($pubAllCount['count']) ? $pubAllCount['count'] : 0) + $pubCount[$url]['count'];
                // The line break inside this message is part of the original literal.
                echo "开始采集:".$url." \n列表 ".$pubCount[$url]['count']."条记录\n";
                if (empty($pubCount[$url]['count'])) {
                    echo spider::errorlog("采集列表记录为0!\n",$url,'url.zero',array('pid'=>$pid,'sid'=>$sid,'rid'=>$rid));
                }
                foreach ($urlsData as $lkey => $value) {
                    spider::$title = $value['title'];
                    spider::$url   = $value['url'];
                    if (spider::$url === false) {
                        continue;
                    }
                    echo "\033[32m开始采集...\033[0m\n";
                    echo "\033[36mtitle:\033[0m".spider::$title."\n";
                    echo "\033[36murl:\033[0m".spider::$url."\n";
                    spider::$rid = $rid;
                    $checker = spider::checker($work, $pid, spider::$url, spider::$title);
                    if ($checker === true) {
                        $wait       = 3;
                        $wait_start = time();
                        // Kept in its own variable so the $callback parameter is not clobbered.
                        $published = spider::publish("shell");
                        if ($published['code'] == "1001") {
                            self::bump_counter($pubCount[$url], 'success');
                            self::bump_counter($pubAllCount, 'success');
                            $wait += time() - $wait_start;
                            echo "\033[32m采集完成并发布成功".str_repeat('.',$wait)."√\033[0m\n";
                            if (!empty($project['sleep'])) {
                                if ($rule['mode'] != "2") {
                                    unset($lists[$lkey]);
                                }
                                gc_collect_cycles();
                                // Project sleep is in milliseconds; usleep() takes microseconds.
                                $usleep = $project['sleep'] * 1000;
                                echo "\033[31m暂停".($project['sleep']/1000)."秒后继续\033[0m\n\n";
                                usleep($usleep);
                            }
                        } else {
                            self::bump_counter($pubCount[$url], 'error');
                            self::bump_counter($pubAllCount, 'error');
                            echo "error\n\n";
                            continue;
                        }
                    }
                    self::bump_counter($pubCount[$url], 'published');
                    self::bump_counter($pubAllCount, 'published');
                }
                if ($rule['mode'] == "2") {
                    phpQuery::unloadDocuments($doc->getDocumentID());
                } else {
                    unset($lists);
                }
            }

            if ($work == "WEB@AUTO" || $work == 'DATA@RULE') {
                spider::$spider_url_ids = array();
                foreach ($urlsData as $lkey => $value) {
                    spider::$title = $value['title'];
                    spider::$url   = $value['url'];
                    if (spider::$url === false) {
                        continue;
                    }
                    $hash = md5(spider::$url);

                    if (spider::$ruleTest) {
                        // Title and URL come from the remote page: escape them before printing HTML.
                        echo '<b>列表抓取结果:</b>'.$lkey.'<br />';
                        echo self::html_text(spider::$title) . ' (<a href="' . APP_URI . '&do=testdata'. '&url=' . urlencode(spider::$url) . '&rid=' . $rid . '&pid=' . $pid . '&title=' . urlencode(spider::$title) . '" target="_blank">测试内容规则</a>) <br />';
                        echo self::html_text(spider::$url) . "<br />";
                        echo $hash . "<br />";
                        unset($value['title'], $value['url']);
                        if ($value) {
                            echo '<b>其它采集结果:</b>';
                            echo '<pre>';
                            var_dump(array_map('htmlspecialchars', $value));
                            echo '</pre>';
                        }
                        echo "<hr />";
                        continue;
                    }

                    if (spider::checker($work, $pid, spider::$url, spider::$title) !== true && !spider::$dataTest) {
                        continue;
                    }
                    $suData = array(
                        'sid'   => 0,
                        'url'   => spider::$url,
                        'title' => spider::$title,
                        'cid'   => $cid,
                        'rid'   => $rid,
                        'pid'   => $pid,
                        'hash'  => $hash
                    );
                    switch ($work) {
                        case 'DATA@RULE':
                            $contentArray[$lkey] = spider_data::crawl($pid, $rid, spider::$url, spider::$title);
                            unset($suData['sid']);
                            $suData['title'] = addslashes($suData['title']);
                            $suData += array(
                                'addtime' => time(),
                                'status'  => '2',
                                'publish' => '2',
                                'indexid' => '0',
                                'pubdate' => '0'
                            );
                            // In data-test mode nothing is stored, so there is no row id.
                            $suid = spider::$dataTest ? null : iDB::insert('spider_url', $suData);
                            spider::$spider_url_ids[$lkey] = $suid;
                            break;
                        case 'WEB@AUTO':
                            $pubArray[] = $suData;
                            break;
                    }
                }
            }
        }

        $lists = null;
        unset($lists);
        gc_collect_cycles();

        // A nested CALLBACK@URL crawl that found no usable list page must still
        // return a list, and must not run the per-mode epilogue below.
        if ($callback === 'CALLBACK@URL') {
            return array();
        }

        switch ($work) {
            case 'WEB@AUTO':
                return $pubArray;
            case 'DATA@RULE':
                return $contentArray;
            case 'WEB@MANUAL':
                return array(
                    'cid'        => $cid,
                    'rid'        => $rid,
                    'pid'        => $pid,
                    'sid'        => $sid,
                    'work'       => $work,
                    'rule'       => $rule,
                    'listsArray' => $listsArray
                );
            case "shell":
                echo "采集数据统结果:\n";
                print_r($pubCount);
                print_r($pubAllCount);
                echo "全部采集完成....\n";
                iDB::update('spider_project',array('lastupdate'=>time()),array('id'=>$pid));
                break;
        }
    }

    /**
     * Turn the raw list matches of one list page into item data.
     *
     * Every row is passed to `spider_tools::listItemData()`. Numeric keys and
     * keys containing `var_` are removed from the item; the remaining extra
     * fields (everything but `title` and `url`) are handed to
     * `spider_tools::listItemCache()` keyed by the item URL.
     *
     * @param mixed  $lists Rows extracted from the list page (array or iterable object).
     * @param array  $rule  Rule definition, passed through to listItemData().
     * @param string $url   URL of the list page, passed through to listItemData().
     *
     * @return array Item data keyed like $lists; rows without data are omitted.
     */
    public static function lists_item_data($lists,$rule,$url){
        $items = array();
        if (!$lists || !(is_array($lists) || $lists instanceof Traversable)) {
            return $items;
        }
        foreach ($lists as $lkey => $row) {
            $extra = array();
            $data  = spider_tools::listItemData($row, $rule, $url);
            if ($data && is_array($data)) {
                foreach ($data as $key => $value) {
                    $isVar = strpos((string)$key, 'var_') !== false;
                    if (is_numeric($key) || $isVar) {
                        unset($data[$key]);
                    }
                    // NOTE(review): the loose != is kept on purpose so behaviour is unchanged;
                    // numeric keys can still end up in the cache - confirm whether that is wanted.
                    if (!$isVar && $key != 'title' && $key != 'url') {
                        $extra[$key] = $value;
                    }
                }
            }
            if (!empty($data['url']) && $extra) {
                spider_tools::listItemCache($data['url'], $extra);
            }
            if ($data) {
                $items[$lkey] = $data;
            }
        }
        return $items;
    }

    /**
     * Increment a named counter, creating it on first use.
     */
    private static function bump_counter(&$counters, $key){
        $counters[$key] = isset($counters[$key]) ? $counters[$key] + 1 : 1;
    }

    /**
     * Escape scraped text for output inside HTML.
     */
    private static function html_text($text){
        return htmlspecialchars((string)$text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }
}